"""Count the frequency of Chinese characters in text files with PySpark."""
import re

from pyspark.sql import SparkSession

if __name__ == '__main__':
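    # local[*] runs Spark locally, using all available CPU cores.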
    spark = SparkSession.builder.appName("word_count").master("local[*]").getOrCreate()
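    # Read every file matching the glob into an RDD of lines.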
    rdd = spark.sparkContext.textFile("../../data/chinese*.txt")
    # Extract each CJK character, pair it with a count of 1, sum the
    # counts per character, and sort in descending order of frequency.
    # reduceByKey combines counts map-side, avoiding the full shuffle
    # that groupByKey().mapValues(sum) incurs.
    count_rdd = rdd.flatMap(lambda line: re.findall(r'[\u4e00-\u9fa5]', line))\
        .map(lambda ch: (ch, 1))\
        .reduceByKey(lambda a, b: a + b)\
        .sortBy(lambda wc: wc[1], ascending=False)\
        .cache()  # count_rdd is consumed twice below; caching avoids recomputation
    # foreach runs on the executors; the prints only reach this console
    # because the job runs in local mode.
    count_rdd.foreach(print)
    # coalesce(1) merges the result into a single part file without the
    # full shuffle that repartition(1) would trigger. Note that
    # saveAsTextFile fails if the output directory already exists.
    count_rdd.coalesce(1).saveAsTextFile("../../data/wc/")
    spark.stop()
